library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.5
## ✔ dplyr       1.1.3     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.0     ✔ feasts      0.4.1
## ✔ lubridate   1.9.2     ✔ fable       0.4.1
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'tsibbledata' was built under R version 4.3.3
## Warning: package 'feasts' was built under R version 4.3.3
## Warning: package 'fabletools' was built under R version 4.3.3
## Warning: package 'fable' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.3.3
library(fable)
library(fabletools)
library(tsibble)
library(dplyr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ readr   2.1.4
## ✔ purrr   1.0.2     ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(feasts)
library(lubridate)
library(fable)
library(fabletools)
library(lubridate)
library(broom)
library(glue)

4.3 Use a feature-based approach to look for outlying series in the PBS data. What is unusual about the series you identify as “outliers”.

rm(list = ls())
View(PBS)

A filtered dataset where Cost has variability within each group.

PBS_filtered <- PBS %>%
  group_by(Concession, Type, ATC1, ATC2) %>%
  # Keeps only groups where the variance of Cost is greater than zero (i.e., removes groups where Cost is constant or missing)
  filter(var(Cost, na.rm = TRUE) > 0) %>%
  ungroup()
View(PBS_filtered)

Extract time series features from the Cost column of PBS_filtered using the feasts package and removes missing values.

PBS_feat has one row per group (Concession, Type, ATC1, ATC2).

PBS_feat <- PBS_filtered %>%
  features(Cost, feature_set(pkgs = "feasts")) %>%
  na.omit()
## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value

## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value

## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value

## Warning in optimise(lambda_coef_var, c(lower, upper), x = x, .period =
## max(.period, : NA/Inf replaced by maximum positive value
View(PBS_feat)

Filter PBS_feat to remove rows containing non-finite numeric values (i.e., Inf, -Inf, or NaN).

PBS_feat <- PBS_feat %>%
  filter_if(is.numeric, all_vars(is.finite(.)))

Perform Principal Component Analysis (PCA) on the PBS_feat dataset while retaining metadata for further analysis.

PBS_prcomp <- PBS_feat %>%
  # Remove categorical/grouping columns to retain only numerical features for PCA.
  select(-Concession, -Type, -ATC1, -ATC2) %>%
  # Performs PCA on the selected numeric features, standardizing them (scale = TRUE ensures mean 0 and variance 1).
  prcomp(scale = TRUE) %>%
  # Adds the PCA results (principal component scores) back to the original PBS_feat dataset.
  augment(PBS_feat)
View(PBS_prcomp)
colnames(PBS_prcomp)
##   [1] ".rownames"              "Concession"             "Type"                  
##   [4] "ATC1"                   "ATC2"                   "trend_strength"        
##   [7] "seasonal_strength_year" "seasonal_peak_year"     "seasonal_trough_year"  
##  [10] "spikiness"              "linearity"              "curvature"             
##  [13] "stl_e_acf1"             "stl_e_acf10"            "acf1"                  
##  [16] "acf10"                  "diff1_acf1"             "diff1_acf10"           
##  [19] "diff2_acf1"             "diff2_acf10"            "season_acf1"           
##  [22] "pacf5"                  "diff1_pacf5"            "diff2_pacf5"           
##  [25] "season_pacf"            "zero_run_mean"          "nonzero_squared_cv"    
##  [28] "zero_start_prop"        "zero_end_prop"          "lambda_guerrero"       
##  [31] "kpss_stat"              "kpss_pvalue"            "pp_stat"               
##  [34] "pp_pvalue"              "ndiffs"                 "nsdiffs"               
##  [37] "bp_stat"                "bp_pvalue"              "lb_stat"               
##  [40] "lb_pvalue"              "var_tiled_var"          "var_tiled_mean"        
##  [43] "shift_level_max"        "shift_level_index"      "shift_var_max"         
##  [46] "shift_var_index"        "shift_kl_max"           "shift_kl_index"        
##  [49] "spectral_entropy"       "n_crossing_points"      "longest_flat_spot"     
##  [52] "coef_hurst"             "stat_arch_lm"           ".fittedPC1"            
##  [55] ".fittedPC2"             ".fittedPC3"             ".fittedPC4"            
##  [58] ".fittedPC5"             ".fittedPC6"             ".fittedPC7"            
##  [61] ".fittedPC8"             ".fittedPC9"             ".fittedPC10"           
##  [64] ".fittedPC11"            ".fittedPC12"            ".fittedPC13"           
##  [67] ".fittedPC14"            ".fittedPC15"            ".fittedPC16"           
##  [70] ".fittedPC17"            ".fittedPC18"            ".fittedPC19"           
##  [73] ".fittedPC20"            ".fittedPC21"            ".fittedPC22"           
##  [76] ".fittedPC23"            ".fittedPC24"            ".fittedPC25"           
##  [79] ".fittedPC26"            ".fittedPC27"            ".fittedPC28"           
##  [82] ".fittedPC29"            ".fittedPC30"            ".fittedPC31"           
##  [85] ".fittedPC32"            ".fittedPC33"            ".fittedPC34"           
##  [88] ".fittedPC35"            ".fittedPC36"            ".fittedPC37"           
##  [91] ".fittedPC38"            ".fittedPC39"            ".fittedPC40"           
##  [94] ".fittedPC41"            ".fittedPC42"            ".fittedPC43"           
##  [97] ".fittedPC44"            ".fittedPC45"            ".fittedPC46"           
## [100] ".fittedPC47"            ".fittedPC48"

An interactive PCA scatter plot where hovering over points reveals their corresponding series labels.

p1 <- PBS_prcomp %>%
  # Combines Concession, Type, ATC1, and ATC2 into a single identifier (serie), separated by "-", while keeping original columns.
  unite("serie", Concession:ATC2, sep = "-", remove = FALSE) %>%
  # Plots the first two principal components (.fittedPC1 and .fittedPC2), using serie as labels.
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, label = serie)) +
  geom_point() +
  labs(title = "PCA of PBS Series with PC1 and PC2")
# Converts the ggplot2 plot into an interactive plotly visualization.
plotly::ggplotly(p1)
summary(PBS_prcomp$.fittedPC1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -7.329  -3.507   1.205   0.000   2.963   9.361

A faceted time series plot highlighting the outlying series in PCA space.

# Select rows from PBS_prcomp where the first principal component (.fittedPC1) is greater than 6, identifying potential outliers.
outliers <- PBS_prcomp %>%
  filter(.fittedPC1 > 6)

print(outliers)
## # A tibble: 4 × 101
##   .rownames Concession   Type  ATC1  ATC2  trend_strength seasonal_strength_year
##   <chr>     <chr>        <chr> <chr> <chr>          <dbl>                  <dbl>
## 1 134       Concessional Safe… J     J06           0.0917                  0.159
## 2 190       General      Co-p… C     C05           0.105                   0.245
## 3 243       General      Co-p… S     S02           0.140                   0.219
## 4 244       General      Co-p… S     S03           0.162                   0.257
## # ℹ 94 more variables: seasonal_peak_year <dbl>, seasonal_trough_year <dbl>,
## #   spikiness <dbl>, linearity <dbl>, curvature <dbl>, stl_e_acf1 <dbl>,
## #   stl_e_acf10 <dbl>, acf1 <dbl>, acf10 <dbl>, diff1_acf1 <dbl>,
## #   diff1_acf10 <dbl>, diff2_acf1 <dbl>, diff2_acf10 <dbl>, season_acf1 <dbl>,
## #   pacf5 <dbl>, diff1_pacf5 <dbl>, diff2_pacf5 <dbl>, season_pacf <dbl>,
## #   zero_run_mean <dbl>, nonzero_squared_cv <dbl>, zero_start_prop <dbl>,
## #   zero_end_prop <dbl>, lambda_guerrero <dbl>, kpss_stat <dbl>, …
PBS %>%
  # Filter PBS to keep only the series corresponding to outliers.
  semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
  # Plot the Cost time series of these outliers, faceted by the grouping variables.
  autoplot(Cost) +
  facet_grid(vars(Concession, Type, ATC1, ATC2)) +
  labs(title = "Time Series of Outliers Identified in PCA Space")

Plot for “Concessional - Safety Net - J - J06”

PBS %>%
  semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
  filter(Concession == "Concessional", Type == "Safety net", ATC1 == "J", ATC2 == "J06") %>%
  autoplot(Cost) +
  labs(title = "Time Series of Outlier: Concessional - Safety Net - J - J06")

Plot for “General - Co-payments - C - C05”

PBS %>%
  semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
  filter(Concession == "General", Type == "Co-payments", ATC1 == "C", ATC2 == "C05") %>%
  autoplot(Cost) +
  labs(title = "Time Series of Outlier: General - Co-payments - C - C05")

Plot for “General - Co-payments - S - S02”

PBS %>%
  semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
  filter(Concession == "General", Type == "Co-payments", ATC1 == "S", ATC2 == "S02") %>%
  autoplot(Cost) +
  labs(title = "Time Series of Outlier: General - Co-payments - S - S02")

Plot for “General - Co-payments - S - S03”

PBS %>%
  semi_join(outliers, by = c("Concession", "Type", "ATC1", "ATC2")) %>%
  filter(Concession == "General", Type == "Co-payments", ATC1 == "S", ATC2 == "S03") %>%
  autoplot(Cost) +
  labs(title = "Time Series of Outlier: General - Co-payments - S - S03")

Observations

Faceted Series:

  • Each panel represents a time series identified as an outlier based on PCA results.
  • The facets display metadata about each series, including:
    • Concession: Specifies whether the series falls under the “Concessional” or “General” category.
    • Type: Denotes the payment type (e.g., “Co-payments”, “Safety Net”).
    • ATC1/ATC2: Classifies the drug based on its therapeutic or chemical grouping.

Outlying Series:

The unusual characteristics of the identified outlier series are:

  1. J06 - Concessional/Safety Net:
    • The series exhibits sharp, periodic spikes with long periods of inactivity.
    • This could indicate batch processing, irregular funding, or missing data between spikes.
  2. C05 - General/Co-payments:
    • A massive, sudden spike near the end of the timeline, followed by a return to zero.
    • This suggests a one-time event such as a bulk purchase, policy shift, or a data recording issue.
  3. S02 - General/Co-payments:
    • Irregular fluctuations throughout, with a sudden surge at the end.
    • The pattern suggests a gradual increase in costs over time with a late-stage anomaly.
  4. S03 - General/Co-payments:
    • Mostly stable with occasional sharp peaks followed by inactivity.
    • This pattern suggests sporadic demand or intermittent funding cycles.

Overall Observations:

  • Most of these outliers have sudden, extreme spikes or long periods of inactivity.
  • They could indicate irregular purchases, funding policies, reporting errors, or anomalies in drug pricing/demand.